# Start from an empty workspace: remove every (non-hidden) object in the
# current environment before the script runs.
# NOTE(review): this does not detach packages or reset options, and it also
# wipes the caller's objects when the script is source()d into a live session
# -- consider dropping it and running the script in a fresh R process instead.
rm(list=ls())

library(readr)
library(dplyr)
library(quanteda)
library(stringi)
library(lubridate)
library(knitr)
library(kableExtra)

# Set the number of threads quanteda may use for its parallelised operations.
# NOTE(review): 60 is hard-coded, presumably for one specific machine --
# confirm how this behaves on hosts with fewer cores.
quanteda_options(threads = 60)

# Newspaper metadata table. The columns used below are Code, Publication and
# Country. The raw table is bound to `newspaper_list` rather than `list` so
# that it does not shadow base::list() for the rest of the script.
newspaper_list <- read_csv("data/list_newspapers_state.csv")

# Keep only the identifying columns, renaming them to snake_case in the same
# step (select() accepts `new_name = old_name` pairs).
source_info <- newspaper_list |>
  dplyr::select(
    source_code = Code,
    publication = Publication,
    country = Country
  )

# Import the per-source RDS files and stack them into `combined_data`.
#
# For each row of `source_info`, the file
# "~/Documents/factiva/data/source_<code>.rds" is read if it exists and the
# resulting data frame is tagged with the row's publication and country.
# Missing files raise a warning and are skipped.
all_data <- list()

for (i in seq_len(nrow(source_info))) {
  # as.character() guarantees that `all_data[[code]]` below indexes by name,
  # even if the Code column was parsed as a number.
  code <- as.character(source_info$source_code[i])
  publication <- source_info$publication[i]
  country <- source_info$country[i]

  file_path <- paste0("~/Documents/factiva/data/source_", code, ".rds")

  if (!file.exists(file_path)) {
    warning("does not exist: ", file_path, call. = FALSE)
    next
  }

  dat <- readRDS(file_path)

  # Use this row's own metadata. Looking the values up again by code would
  # return several values whenever a code appears more than once in the list.
  dat$Publication <- publication
  dat$Country <- country

  # Earlier derivation of `date` from `publication_date`, kept for reference:
  #dat$date <- as.numeric(dat$publication_date)
  #dat$date <- as.POSIXct(dat$date / 1000, origin = "1970-01-01")
  #dat$date <- as.Date(stri_match_first_regex(dat$date, "\\d{4}-\\d{2}-\\d{2}"))
  #dat$year <- year(dat$date)
  # NOTE(review): assumes the stored data already carries a `date` column,
  # since the derivation above is disabled -- TODO confirm.
  dat$month <- month(dat$date)

  # Keep an existing docid (as a factor whose levels follow first appearance);
  # otherwise number the rows within this file.
  if ("docid" %in% names(dat)) {
    dat$docid <- factor(dat$docid, levels = unique(dat$docid))
  } else {
    dat$docid <- paste0("doc_", seq_len(nrow(dat)))
  }
  dat$doc_id <- dat$docid

  all_data[[code]] <- dat
}


# One data frame holding the rows of every source file that was found.
combined_data <- bind_rows(all_data)


# Build one quanteda corpus per country and save each to
# "data/corpus_<country>.RDS".
#
# Within each country the rows are renumbered "<country>_<n>" and that value
# becomes the corpus document id; the text is taken from the `body` column.
for (country in unique(combined_data$Country)) {
  # `.env$country` pins the comparison to the loop variable. A bare `country`
  # would be resolved against the data first, so a column of that name (if one
  # exists) would silently take precedence.
  subset_df <- combined_data |>
    filter(Country == .env$country)
  subset_df$docid <- paste0(country, "_", seq_len(nrow(subset_df)))

  corp <- corpus(subset_df, text_field = "body", docid_field = "docid")
  saveRDS(corp, paste0("data/corpus_", country, ".RDS"))
}
